In [ ]:
from konlpy.tag import Kkma  # use the Kkma (꼬꼬마) morphological analyzer
kkma = Kkma()
text = "오늘 서울의 날씨는 추워질 전망입니다. 오후 한때 소나기가 올 예정입니다. 아, 오늘은 좀 힘드네요...이런? 난 도대체 뭐지?! 뭐랄까? 뭐라는거니"
sentences = kkma.sentences(text)
for sentence in sentences:
    print(sentence)
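Kkma can also tokenize the same text at the morpheme level; a minimal sketch reusing the `kkma` and `text` objects from above:
In [ ]:
# Morpheme-level tokenization with the same Kkma instance (illustrative sketch)
morphs = kkma.morphs(text)
print(morphs[:10])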
In [ ]:
kkma.nouns(text)
In [ ]:
pos_tag = kkma.pos(text)
print(pos_tag[:5])
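To get a feel for which parts of speech dominate, the tag column can be aggregated; a small sketch using `collections.Counter` (introduced again below for the noun counts):
In [ ]:
# Count how often each Kkma POS tag appears (sketch)
from collections import Counter
tag_counts = Counter(tag for _, tag in pos_tag)
print(tag_counts.most_common(5))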
In [ ]:
# Built-in corpora can be loaded, just like in NLTK
from konlpy.corpus import kolaw
fids = kolaw.fileids()
fids
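The bundled corpus files can be read with `kolaw.open()`; a minimal sketch assuming `fids` lists `'constitution.txt'`:
In [ ]:
# Open and peek at the first bundled corpus file (the Korean constitution) - sketch
constitution = kolaw.open(fids[0]).read()
print(constitution[:100])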
In [ ]:
# Analyzing a real data file - encoding issues
with open('pgh-2015.txt', 'r') as f:
    lines = f.read().splitlines()
print(lines[:5])
In [ ]:
# If you get "'cp949' codec can't decode byte 0xed in position 6: illegal multibyte sequence",
# the operating system's default encoding differs from UTF-8, so pass the encoding explicitly
with open('pgh-2015.txt', 'r', encoding='utf-8') as f:
    lines = f.read().splitlines()
print(lines[:5])
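If a file's encoding is not known in advance, one defensive pattern (a sketch, not part of the original analysis) is to try UTF-8 first and fall back to cp949:
In [ ]:
# Try UTF-8 first, then fall back to the Windows default cp949 (sketch)
try:
    with open('pgh-2015.txt', 'r', encoding='utf-8') as f:
        lines = f.read().splitlines()
except UnicodeDecodeError:
    with open('pgh-2015.txt', 'r', encoding='cp949') as f:
        lines = f.read().splitlines()
print(lines[:5])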
In [ ]:
# Remove empty sentences ('')
sentences = [line for line in lines if line != '']
for line in lines[:5]:
    if line != '':
        print(line)
In [ ]:
# Morphological analysis with Komoran
from konlpy.tag import Komoran
tagger = Komoran()
tags = tagger.pos(sentences[0])
print(tags[:4])
In [ ]:
tagged_sentences = [tagger.pos(sent) for sent in sentences]
tagged_sentences[0]
In [ ]:
# Build a list of nouns
noun_list = []
for sent in tagged_sentences:
    for word, tag in sent:
        if tag in ['NNP', 'NNG']:
            noun_list.append(word)
noun_list[:10]
In [ ]:
# Compute word frequencies with the collections library
from collections import Counter
noun_counts = Counter(noun_list)
noun_counts.most_common(20)
In [ ]:
noun_list = []
stop_words = ['경제', '청년']
for sent in tagged_sentences:
    for word, tag in sent:
        if tag in ['NNP', 'NNG']:
            if word not in stop_words:
                noun_list.append(word)
In [ ]:
import collections
noun_counts = collections.Counter(noun_list)
noun_counts.most_common(10)  # confirm that '경제' and '청년' no longer appear
In [ ]:
import nltk
import matplotlib.pyplot as plt  # matplotlib for visualizing the results
%matplotlib inline
# a plot that shows the words themselves instead of word indices
freqdist = nltk.FreqDist(noun_counts)
freqdist.plot(50)
freqdist.plot(50,cumulative=True)
In [ ]:
# Fix the font so Korean labels render (Windows example)
from matplotlib import font_manager, rc
font_fname = r'C:\Windows\Fonts\NGULIM.TTF' # A font of your choice
font_name = font_manager.FontProperties(fname=font_fname).get_name()
rc('font', family=font_name)
In [ ]:
freqdist.plot(50)
freqdist.plot(50,cumulative=True)
In [ ]:
# Build a list of unique nouns
unique_nouns = set()
unique_list = []
# list-based version: keeps nouns in order of first appearance
for sent in tagged_sentences:
    for word, tag in sent:
        if tag in ['NNP', 'NNG']:
            if word not in unique_list:
                unique_list.append(word)
# set-based version: order is arbitrary, but membership checks are faster
for sent in tagged_sentences:
    for word, tag in sent:
        if tag in ['NNP', 'NNG']:
            unique_nouns.add(word)
unique_nouns = list(unique_nouns)
noun_index = {noun: i for i, noun in enumerate(unique_nouns)}  # dictionary mapping each noun to its index
noun_index
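A quick check that every noun maps back to its position in `unique_nouns`; a small sketch:
In [ ]:
# Sanity check: the dictionary should map each noun back to its list position (sketch)
print(len(unique_nouns), len(noun_index))
print(noun_index[unique_nouns[0]], noun_index[unique_nouns[-1]])  # expected: 0 and len(unique_nouns) - 1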
In [ ]:
import numpy as np
# Create a (number of sentences) x (number of unique nouns) matrix
occurs = np.zeros([len(tagged_sentences), len(unique_nouns)])
np.shape(occurs)
In [ ]:
for i, sent in enumerate(tagged_sentences):
    for word, tag in sent:
        if tag in ['NNP', 'NNG']:
            index = noun_index[word]  # look up the noun's column index
            occurs[i][index] = 1      # mark that noun as present in sentence i
occurs[0]
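The row for sentence 0 can be decoded back into nouns to confirm the matrix was filled correctly; a minimal sketch:
In [ ]:
# Recover the nouns marked in the first sentence's row (sanity-check sketch)
nouns_in_first = [unique_nouns[idx] for idx in np.nonzero(occurs[0])[0]]
print(nouns_in_first)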
In [ ]:
# Compute the noun co-occurrence matrix:
# entry (i, j) counts the sentences that contain both noun i and noun j
co_occurs = occurs.T.dot(occurs)
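A quick way to sanity-check the co-occurrence matrix, as a sketch: since `occurs` is binary, the diagonal entry for a noun should equal the number of sentences that contain it.
In [ ]:
# Sanity check: the diagonal of co_occurs equals each noun's sentence frequency (sketch)
print(np.shape(co_occurs))
print(co_occurs[0][0], occurs[:, 0].sum())  # these two values should match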
In [ ]:
# Print pairs among the first 100 nouns that co-occur in more than one sentence
for i in range(100):
    for j in range(100):
        if co_occurs[i][j] > 1 and i > j:
            print(unique_nouns[i], unique_nouns[j], co_occurs[i][j])
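Rather than scanning only the first 100 nouns, the strongest pairs across the whole matrix can be ranked by sorting; a sketch:
In [ ]:
# Collect and rank all co-occurring noun pairs by their joint count (sketch)
pairs = []
for i in range(len(unique_nouns)):
    for j in range(i + 1, len(unique_nouns)):
        if co_occurs[i][j] > 1:
            pairs.append((co_occurs[i][j], unique_nouns[i], unique_nouns[j]))
pairs.sort(reverse=True)
print(pairs[:10])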
In [ ]:
import networkx as nx
graph = nx.Graph()
# Connect nouns that co-occur in more than 4 sentences
for i in range(len(unique_nouns)):
    for j in range(i + 1, len(unique_nouns)):
        if co_occurs[i][j] > 4:
            graph.add_edge(unique_nouns[i], unique_nouns[j])
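Before drawing, it helps to see how many nouns and links survived the threshold; a quick sketch:
In [ ]:
# Size of the co-occurrence graph after thresholding at 4 (sketch)
print(graph.number_of_nodes(), graph.number_of_edges())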
In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
In [ ]:
krfont = {'family' : 'nanumgothic', 'weight' : 'bold', 'size' : 10}
plt.rc('font',**krfont)
In [ ]:
plt.figure(figsize=(15, 15))
layout = nx.spring_layout(graph, k=.1)
nx.draw(graph, pos=layout, with_labels=True,
        font_size=20, font_family=font_name,  # Korean-capable font set earlier; Comic Sans MS cannot render Hangul labels
        alpha=0.3, node_size=3000)
plt.show()